This document runs the sample representativity analyses reported in the supplement of the paper “Climate Change Engagement of Scientists”.
The combined_data.RDS file includes both the Scopus data
and the survey data. We cannot make this data set publicly available, as
it links the participants of our survey with data from Scopus, which
includes exact citation counts, number of co-authors, first publication,
last publication, h-index, etc. This would make it more likely that
participants could be identified.
library(gt)
library(dplyr)
library(ggpubr)
library(tidyverse)
library(gtsummary)
# Shared styling defaults for plots and tables ----

# Text and background
default_font_color <- "#444444"
default_background_color <- "white"
default_font_family <- "Helvetica"
default_font_size <- 15

# Colour used for missing values
default_na_col <- "#E22030"

# Default plotly canvas dimensions
default_plotly_height <- 550
default_plotly_width <- 800

# Palette; the sample-comparison plots use its first two entries
colors <- c(
  "#FC4E07",
  "#00AFBB",
  "#E7B800",
  "#2cc990",
  "#E090DF",
  "#a0c4ff"
)

# theme_minimal() with both major and minor panel grid lines blanked out
theme_minimal2 <- theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
# Turn a haven_labelled vector into a plain numeric vector by stripping its
# value labels; every other kind of input is handed back unchanged.
convert_to_numeric <- function(x) {
  # Guard clause: nothing to do for columns that carry no haven labels
  if (!inherits(x, 'haven_labelled')) {
    return(x)
  }
  unlabelled <- haven::zap_labels(x)
  as.numeric(unlabelled)
}
# Load the combined data and strip haven labels from every labelled column
dat_all <- readRDS('../data/combined_data.RDS')
dat_all <- dat_all %>% mutate(across(everything(), convert_to_numeric))

# Replace -99 values with NA
dat_all[dat_all == -99] <- NA

# Flag completed surveys and keep only the variables used in the comparison.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables rather than reserved words.
dat_all <- dat_all %>%
  mutate(
    Survey_Finished = case_when(
      Progress >= 100 ~ TRUE,   # People who finished the survey
      Progress < 100 ~ FALSE,   # People who did not finish the survey
      is.na(Progress) ~ FALSE   # People who didn't start the survey
    )
  ) %>%
  select(
    ResponseId, H.Index, Number.of.Documents, Cited.By, Citation.Count,
    Co.Author.Count, First.Publication, Last.Publication, Continent,
    Time_Zone, Email_Wave, Survey_Finished, Progress, SurveySource
  )

# Make factors out of variables
factor_cols <- c(
  'ResponseId', 'Continent', 'Time_Zone',
  'Email_Wave', 'Survey_Finished', 'SurveySource'
)
dat_all[factor_cols] <- lapply(dat_all[factor_cols], as_factor)

# Create data.frame for comparison: every invited scientist appears once as
# 'Total Sample'; those who finished the survey appear a second time as
# 'Survey'.
# Survey_Finished is a factor at this point, so it is compared against its
# level label explicitly instead of relying on logical-to-character coercion.
comp <- rbind(
  transform(dat_all, Sample = 'Total Sample'),
  transform(dat_all[dat_all$Survey_Finished == 'TRUE', ], Sample = 'Survey')
)
The following section compares the total sample of invited scientists (N = 249,876) to the sample that actually participated in our study (n = 9,220).
This table shows the share of scientists from each continent in the Total Sample and among the scientists who completed our survey. European scientists are considerably over-represented, whereas Asian scientists are clearly under-represented. Scientists based in North America, South America, Oceania and Africa are slightly over-represented in our survey. Note that we do not have country / continent information for every participant.
# Continent cross-table: share of each continent within each sample

# Recode the unknown-continent level to NA so that `missing = 'no'` leaves it
# out of the table.
# NOTE(review): assumes the unknown category is the level in position 7 —
# confirm against levels(comp$Continent).
levels(comp$Continent)[7] <- NA

t <- comp %>%
  tbl_cross(
    row = Continent,
    col = Sample,
    percent = 'col',
    missing = 'no',
    label = list(
      Sample ~ 'Sample Comparison',
      Continent ~ 'Continent'
    )
  ) %>%
  # Hide the overall (Total) column
  modify_column_hide(columns = stat_0) %>%
  bold_labels()

t
| | Sample Comparison: Survey | Sample Comparison: Total Sample |
|---|---|---|
| **Continent** | | |
| Asia | 1,072 (12%) | 93,958 (38%) |
| Europe | 4,691 (51%) | 77,916 (31%) |
| North America | 2,559 (28%) | 63,365 (25%) |
| Oceania | 470 (5.1%) | 8,189 (3.3%) |
| South America | 287 (3.1%) | 3,808 (1.5%) |
| Africa | 110 (1.2%) | 2,017 (0.8%) |
| Total | 9,189 (100%) | 249,253 (100%) |
# Refactor Sample so that Survey Population will be in front
comp$Sample <- factor(comp$Sample, levels = c('Total Sample', 'Survey'))

# Year of First publication
# Median per sample, computed once. Passing the value as a constant
# `xintercept` draws exactly one line per sample; mapping it inside aes() with
# the full subset as layer data would draw one identical line per row.
year_medians <- tapply(
  comp$First.Publication, comp$Sample, median, na.rm = TRUE
)

p_year <- comp %>%
  ggplot(aes(x = First.Publication, fill = Sample, col = Sample)) +
  geom_density(alpha = 0.6) +
  # Add Median for total Sample
  # NOTE(review): the other comparison plots use 'dashed' for both medians;
  # confirm that 'longdash' is intended here.
  geom_vline(xintercept = year_medians[['Total Sample']],
             linetype = 'longdash',
             color = colors[1]) +
  # Add Median for people who finished the survey
  geom_vline(xintercept = year_medians[['Survey']],
             linetype = 'dashed',
             color = colors[2]) +
  scale_fill_manual(values = colors[1:2]) +
  scale_color_manual(values = colors[1:2]) +
  # Values outside 1960-2025 are dropped from the plot
  scale_x_continuous(breaks = seq(1960, 2025, 5),
                     limits = c(1960, 2025)) +
  labs(title = 'Sample Comparison',
       subtitle = 'Lines indicate the Median',
       x = 'Year of first publication') +
  theme_minimal2
p_year
# H-Index
# Median per sample, computed once. Passing the value as a constant
# `xintercept` draws exactly one line per sample; mapping it inside aes() with
# the full subset as layer data would draw one identical line per row.
hindex_medians <- tapply(comp$H.Index, comp$Sample, median, na.rm = TRUE)

p_hindex <- comp %>%
  ggplot(aes(x = H.Index, fill = Sample, col = Sample)) +
  geom_density(alpha = 0.6) +
  # Add Median for total Sample
  geom_vline(xintercept = hindex_medians[['Total Sample']],
             linetype = 'dashed',
             color = colors[1]) +
  # Add Median for people who finished the survey
  geom_vline(xintercept = hindex_medians[['Survey']],
             linetype = 'dashed',
             color = colors[2]) +
  scale_fill_manual(values = colors[1:2]) +
  scale_color_manual(values = colors[1:2]) +
  # Values above 125 are dropped from the plot
  scale_x_continuous(breaks = seq(0, 125, 10),
                     limits = c(0, 125)) +
  labs(title = 'Sample Comparison',
       subtitle = 'Lines indicate the Median',
       x = 'H-Index') +
  theme_minimal2
p_hindex
# Number of Articles
# Median per sample, computed once. Passing the value as a constant
# `xintercept` draws exactly one line per sample; mapping it inside aes() with
# the full subset as layer data would draw one identical line per row.
articles_medians <- tapply(
  comp$Number.of.Documents, comp$Sample, median, na.rm = TRUE
)

p_articles <- comp %>%
  ggplot(aes(x = Number.of.Documents, fill = Sample, col = Sample)) +
  geom_density(alpha = 0.6) +
  # Add Median for total Sample
  geom_vline(xintercept = articles_medians[['Total Sample']],
             linetype = 'dashed',
             color = colors[1]) +
  # Add Median for people who finished the survey
  geom_vline(xintercept = articles_medians[['Survey']],
             linetype = 'dashed',
             color = colors[2]) +
  scale_fill_manual(values = colors[1:2]) +
  scale_color_manual(values = colors[1:2]) +
  # Values above 400 are dropped from the plot
  scale_x_continuous(breaks = seq(0, 400, 50),
                     limits = c(0, 400)) +
  labs(title = 'Sample Comparison',
       subtitle = 'Lines indicate the Median',
       x = 'Number of Documents authored') +
  theme_minimal2
p_articles
# Number of Citations
# Median per sample, computed once. Passing the value as a constant
# `xintercept` draws exactly one line per sample; mapping it inside aes() with
# the full subset as layer data would draw one identical line per row.
citation_medians <- tapply(
  comp$Citation.Count, comp$Sample, median, na.rm = TRUE
)

p_citations <- comp %>%
  ggplot(aes(x = Citation.Count, fill = Sample, col = Sample)) +
  geom_density(alpha = 0.6) +
  # Add Median for total Sample
  geom_vline(xintercept = citation_medians[['Total Sample']],
             linetype = 'dashed',
             color = colors[1]) +
  # Add Median for people who finished the survey
  geom_vline(xintercept = citation_medians[['Survey']],
             linetype = 'dashed',
             color = colors[2]) +
  scale_fill_manual(values = colors[1:2]) +
  scale_color_manual(values = colors[1:2]) +
  # Values above 6000 are dropped from the plot
  scale_x_continuous(breaks = seq(0, 6000, 500),
                     limits = c(0, 6000)) +
  labs(title = 'Sample Comparison',
       subtitle = 'Lines indicate the Median',
       x = 'Total Number of Citations') +
  theme_minimal2
p_citations
# Citing Authors
# Median per sample, computed once. Passing the value as a constant
# `xintercept` draws exactly one line per sample; mapping it inside aes() with
# the full subset as layer data would draw one identical line per row.
citedby_medians <- tapply(comp$Cited.By, comp$Sample, median, na.rm = TRUE)

p_citedby <- comp %>%
  ggplot(aes(x = Cited.By, fill = Sample, col = Sample)) +
  geom_density(alpha = 0.6) +
  # Add Median for total Sample
  geom_vline(xintercept = citedby_medians[['Total Sample']],
             linetype = 'dashed',
             color = colors[1]) +
  # Add Median for people who finished the survey
  geom_vline(xintercept = citedby_medians[['Survey']],
             linetype = 'dashed',
             color = colors[2]) +
  scale_fill_manual(values = colors[1:2]) +
  scale_color_manual(values = colors[1:2]) +
  # Values above 4000 are dropped from the plot
  scale_x_continuous(breaks = seq(0, 4000, 500),
                     limits = c(0, 4000)) +
  labs(title = 'Sample Comparison',
       subtitle = 'Lines indicate the Median',
       x = 'Total Number of Citing Authors') +
  theme_minimal2
p_citedby
# Sample overview ----
# Put 'Survey' first so it becomes the first column of the summary tables
comp$Sample <- factor(comp$Sample, levels = c('Survey', 'Total Sample'))

# Means: numeric variables are reported as mean (SD), categorical variables
# as n (%); rows with missing values are not listed separately.
t_mean <- tbl_summary(
  data = select(
    comp,
    Sample, H.Index, Number.of.Documents, Cited.By, Citation.Count,
    Co.Author.Count, First.Publication, Continent
  ),
  by = Sample,
  missing = 'no',
  # Force every numeric column to be summarised as continuous
  type = where(is.numeric) ~ 'continuous',
  statistic = list(
    all_categorical() ~ '{n} ({p}%)',
    all_continuous() ~ '{mean} ({sd})'
  )
)

t_mean
| Characteristic | Survey, N = 9,220¹ | Total Sample, N = 249,876¹ |
|---|---|---|
| H.Index | 21 (20) | 23 (21) |
| Number.of.Documents | 74 (117) | 89 (124) |
| Cited.By | 2,717 (6,953) | 3,014 (6,860) |
| Citation.Count | 3,774 (10,315) | 4,145 (10,065) |
| Co.Author.Count | 338 (1,136) | 351 (967) |
| First.Publication | 2,006 (12) | 2,006 (11) |
| Continent | | |
| Asia | 1,072 (12%) | 93,958 (38%) |
| Europe | 4,691 (51%) | 77,916 (31%) |
| North America | 2,559 (28%) | 63,365 (25%) |
| Oceania | 470 (5.1%) | 8,189 (3.3%) |
| South America | 287 (3.1%) | 3,808 (1.5%) |
| Africa | 110 (1.2%) | 2,017 (0.8%) |
| ¹ Mean (SD); n (%) | | |
# Medians: numeric variables are reported as median (SD), categorical
# variables as n (%); rows with missing values are not listed separately.
# NOTE(review): the median is paired with the SD rather than an IQR — confirm
# that this is the intended statistic.
t_med <- tbl_summary(
  data = select(
    comp,
    Sample, H.Index, Number.of.Documents, Cited.By, Citation.Count,
    Co.Author.Count, First.Publication, Continent
  ),
  by = Sample,
  missing = 'no',
  # Force every numeric column to be summarised as continuous
  type = where(is.numeric) ~ 'continuous',
  statistic = list(
    all_categorical() ~ '{n} ({p}%)',
    all_continuous() ~ '{median} ({sd})'
  )
)

t_med
| Characteristic | Survey, N = 9,220¹ | Total Sample, N = 249,876¹ |
|---|---|---|
| H.Index | 15 (20) | 18 (21) |
| Number.of.Documents | 36 (117) | 49 (124) |
| Cited.By | 710 (6,953) | 1,002 (6,860) |
| Citation.Count | 871 (10,315) | 1,243 (10,065) |
| Co.Author.Count | 97 (1,136) | 125 (967) |
| First.Publication | 2,009 (12) | 2,008 (11) |
| Continent | | |
| Asia | 1,072 (12%) | 93,958 (38%) |
| Europe | 4,691 (51%) | 77,916 (31%) |
| North America | 2,559 (28%) | 63,365 (25%) |
| Oceania | 470 (5.1%) | 8,189 (3.3%) |
| South America | 287 (3.1%) | 3,808 (1.5%) |
| Africa | 110 (1.2%) | 2,017 (0.8%) |
| ¹ Median (SD); n (%) | | |